In [1]:
# Import the libraries used for data handling and visualization.
import numpy as np # numerical arrays and vectorized math.
import pandas as pd # tabular data (DataFrame) handling; built on top of NumPy.
import matplotlib.pyplot as plt # static plotting.
import seaborn as sns # statistical plotting built on top of matplotlib.
import plotly.express as px # interactive plotting.
import warnings # control over warning output.
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the plotting libraries.
warnings.filterwarnings('ignore')
In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from minepy import MINE
In [3]:
# Read the attrition dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="dataset/Employee_Attrition.csv")
# Show the first ten rows, shaded with a red background gradient.
df.head(n=10).style.background_gradient(cmap="Reds")
Out[3]:
| | Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 5 | 32 | No | Travel_Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 6 | 59 | No | Travel_Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 7 | 30 | No | Travel_Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
| 8 | 38 | No | Travel_Frequently | 216 | Research & Development | 23 | 3 | Life Sciences | 1 | 12 | 4 | Male | 44 | 2 | 3 | Manufacturing Director | 3 | Single | 9526 | 8787 | 0 | Y | No | 21 | 4 | 2 | 80 | 0 | 10 | 2 | 3 | 9 | 7 | 1 | 8 |
| 9 | 36 | No | Travel_Rarely | 1299 | Research & Development | 27 | 3 | Medical | 1 | 13 | 3 | Male | 94 | 3 | 2 | Healthcare Representative | 3 | Married | 5237 | 16577 | 6 | Y | No | 13 | 3 | 2 | 80 | 2 | 17 | 3 | 2 | 7 | 7 | 7 | 7 |
In [4]:
# Correlation of every numeric column with DailyRate and with Age
# (the two selected columns of the full correlation matrix).
c = df.corr(numeric_only=True).loc[:, ["DailyRate", "Age"]]
c
Out[4]:
| | DailyRate | Age |
|---|---|---|
| Age | 0.010661 | 1.000000 |
| DailyRate | 1.000000 | 0.010661 |
| DistanceFromHome | -0.004985 | -0.001686 |
| Education | -0.016806 | 0.208034 |
| EmployeeCount | NaN | NaN |
| EmployeeNumber | -0.050990 | -0.010145 |
| EnvironmentSatisfaction | 0.018355 | 0.010146 |
| HourlyRate | 0.023381 | 0.024287 |
| JobInvolvement | 0.046135 | 0.029820 |
| JobLevel | 0.002966 | 0.509604 |
| JobSatisfaction | 0.030571 | -0.004892 |
| MonthlyIncome | 0.007707 | 0.497855 |
| MonthlyRate | -0.032182 | 0.028051 |
| NumCompaniesWorked | 0.038153 | 0.299635 |
| PercentSalaryHike | 0.022704 | 0.003634 |
| PerformanceRating | 0.000473 | 0.001904 |
| RelationshipSatisfaction | 0.007846 | 0.053535 |
| StandardHours | NaN | NaN |
| StockOptionLevel | 0.042143 | 0.037510 |
| TotalWorkingYears | 0.014515 | 0.680381 |
| TrainingTimesLastYear | 0.002453 | -0.019621 |
| WorkLifeBalance | -0.037848 | -0.021490 |
| YearsAtCompany | -0.034055 | 0.311309 |
| YearsInCurrentRole | 0.009932 | 0.212901 |
| YearsSinceLastPromotion | -0.033229 | 0.216513 |
| YearsWithCurrManager | -0.026363 | 0.202089 |
In [5]:
# Annotated heatmap of the two-column correlation table `c`.
plt.figure(figsize=(10, 10))
sns.heatmap(data=c, annot=True, fmt=".2", cmap="Reds")
plt.title(label="correlation between DailyRate and Age", color="k", fontsize=17)
plt.show()
In [6]:
# Bar chart with the number of rows for each Education value.
sns.countplot(data=df, x="Education")
plt.title(label="Count Of Education Number", color="k", fontsize=17)
plt.xlabel(xlabel="Education", color="g", fontsize=16)
plt.ylabel(ylabel="Count", color="r", fontsize=16)
plt.show()
In [7]:
# Bar chart with the number of rows for each EducationField value.
sns.countplot(data=df, x="EducationField")
plt.title(label="Count Of EducationField Number", color="k", fontsize=17)
plt.xlabel(xlabel="EducationField", color="g", fontsize=16)
plt.ylabel(ylabel="count", color="r", fontsize=16)
plt.show()
In [8]:
# One 30-bin histogram per column that DataFrame.hist can plot, on a 16x16 inch figure.
df.hist(figsize=(16, 16), bins=30)
plt.show()
In [9]:
# Annotated heatmap of the correlation matrix of all numeric columns.
plt.figure(figsize=(15, 15))
sns.heatmap(
    data=df.corr(numeric_only=True),
    annot=True,
    fmt=".1",
)
plt.title(label="Correlation Between Columns", color="k", fontsize=25)
plt.show()
In [10]:
# Box plot of Age for each Education value.
sns.catplot(data=df, x="Education", y="Age", kind="box")
plt.title(label="Boxplot For Education", color="k", fontsize=20)
plt.xlabel(xlabel="Education", color="r", fontsize=16)
plt.ylabel(ylabel="Age", color="g", fontsize=16)
plt.show()
In [11]:
# Pairwise plots restricted to the Age and DailyRate columns.
sns.pairplot(data=df, vars=["Age", "DailyRate"])
plt.show()
In [12]:
# Red scatter plot of DailyRate against TotalWorkingYears.
plt.scatter(df["TotalWorkingYears"], df["DailyRate"], c="r")
plt.title(label="Distribution 2 Columns by Scatter", color="k", fontsize=20)
plt.xlabel(xlabel="TotalWorkingYears", color="y", fontsize=16)
# Left as the cell's final expression, so its Text object is echoed as the cell output.
plt.ylabel(ylabel="DailyRate", color="g", fontsize=16)
Out[12]:
Text(0, 0.5, 'DailyRate')
In [13]:
# Blue scatter plot of Age against Education.
plt.scatter(df["Education"], df["Age"], c="b")
plt.title(label="Distribution 2 Columns by Scatter", color="k", fontsize=20)
plt.xlabel(xlabel="Education", color="y", fontsize=16)
# Left as the cell's final expression, so its Text object is echoed as the cell output.
plt.ylabel(ylabel="Age", color="g", fontsize=16)
Out[13]:
Text(0, 0.5, 'Age')
In [14]:
# Line plot of the Age and Education columns against the DataFrame's row index.
plt.figure(figsize=(8, 5))
df["Age"].plot()
df["Education"].plot()
plt.title("Distribution Age and Education", c="k", fontsize=20)
# The x-axis is the row index and the y-axis the column values; the labels
# previously showed the literal text "None".
plt.xlabel("Row index", fontsize=16, c="r")
plt.ylabel("Value", fontsize=16, c="g")
# Legend entries follow plotting order: Age first, Education second
# (the second entry used to be mislabelled "DailyRate").
plt.legend(["Age", "Education"])
plt.tight_layout()
plt.show()
In [15]:
# Bar plot of Education (seaborn's default aggregate) for each EducationField.
sns.barplot(x="EducationField", y="Education", data=df)
plt.title(label="Distribution of barplot", color="k", fontsize=20)
plt.xlabel(xlabel="EducationField", color="r", fontsize=16)
plt.ylabel(ylabel="Education", color="g", fontsize=16)
plt.show()
In [16]:
# Vertical bar chart of how often each Attrition value occurs.
plt.figure(figsize=(7, 7))
df["Attrition"].value_counts().plot.bar()
plt.show()
In [17]:
# Mean EmployeeCount per EmployeeNumber; the eight smallest means are drawn as bars.
# NOTE(review): describe() reports EmployeeCount as constant (min = max = 1), so
# every bar here has height 1 — confirm this chart is still wanted.
(
    df.groupby("EmployeeNumber")["EmployeeCount"]
    .mean()
    .sort_values()
    .head(8)
    .plot.bar()
)
plt.title(label="Distribution of groupby EmployeeCount and EmployeeNumber", color="k", fontsize=20)
plt.xlabel(xlabel="EmployeeNumber", color="r", fontsize=16)
plt.show()
In [18]:
# Grid of box plots, one per column in positions 23..33 of the DataFrame.
plt.figure(figsize=(11, 11))
columns = df.columns[23:34]
for i, column in enumerate(columns, start=1):
    # Panels fill a 5x4 grid in reading order.
    plt.subplot(5, 4, i)
    sns.boxplot(color="y", x=df[column])
plt.tight_layout()
plt.show()
In [19]:
# Stacked distribution plots (histogram + KDE), one per column in positions 23..29.
plt.figure(figsize=(15, 15))
columns = df.columns[23:30]
for i, column in enumerate(columns, start=1):
    plt.subplot(7, 1, i)
    # sns.distplot is deprecated and scheduled for removal; histplot with
    # kde=True and stat="density" is the replacement recommended by seaborn
    # for the same histogram-plus-density view.
    # NOTE(review): the slice includes StandardHours, which describe() shows
    # as constant, so no meaningful density curve can be drawn for that panel.
    sns.histplot(x=df[column], kde=True, stat="density", color="b")
plt.tight_layout()
plt.show()
In [20]:
# Interactive histogram of Education, one colour per Education value.
fig = px.histogram(data_frame=df, x="Education", color="Education")
fig.show()
In [21]:
# Interactive histogram of Attrition, one colour per Attrition value.
fig = px.histogram(data_frame=df, x="Attrition", color="Attrition")
fig.show()
In [22]:
# Interactive histogram of EducationField, one colour per EducationField value.
fig = px.histogram(data_frame=df, x="EducationField", color="EducationField")
fig.show()
In [23]:
# Box plot of Age for each Attrition value.
sns.catplot(data=df, x="Attrition", y="Age", kind="box")
plt.title(label="Boxplot For Attrition", color="k", fontsize=20)
plt.xlabel(xlabel="Attrition", color="r", fontsize=16)
plt.ylabel(ylabel="Age", color="g", fontsize=16)
plt.show()
In [24]:
# pandas box plot of Age grouped by Education, used to eyeball outliers.
b = df.boxplot(column="Age", by="Education", figsize=(10, 6))
plt.title("Plotting boxplot to check outliers", fontsize=20, c="k")
plt.xlabel("Education", fontsize=16, c="g")
# The y-axis shows Age values (column="Age"); it was previously labelled "outliers".
plt.ylabel("Age", fontsize=16, c="r")
# Left as the cell's final expression, so the Axes object is echoed as the cell output.
b
Out[24]:
<Axes: title={'center': 'Plotting boxplot to check outliers'}, xlabel='Education', ylabel='outliers'>
In [25]:
# Pairwise plots restricted to Education and EducationField.
# NOTE(review): EducationField holds strings (object dtype per df.info()) —
# confirm the resulting panels are meaningful.
sns.pairplot(data=df, vars=["Education", "EducationField"])
plt.show()
In [26]:
# Red line plot of the Age column against the DataFrame's row index.
plt.figure(figsize=(12, 5))
df["Age"].plot(c="r")
plt.title("Distribution Age", c="k", fontsize=20)
# The x-axis is the row index and the y-axis the Age values; the labels
# previously showed the literal text "None".
plt.xlabel("Row index", fontsize=16, c="r")
plt.ylabel("Age", fontsize=16, c="g")
plt.legend(["Age"])
plt.tight_layout()
plt.show()
In [27]:
# Bar plot relating Education and EducationField, split by Attrition.
sns.catplot(data=df, x="Education", y="EducationField", hue="Attrition", kind="bar")
plt.show()
In [28]:
# Joint plot of DailyRate against Age (bivariate panel plus marginal distributions).
sns.jointplot(data=df, x="Age", y="DailyRate")
plt.show()
Data Preprocessing
In [29]:
# Print the column names, non-null counts, dtypes and memory usage of the DataFrame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
In [30]:
# (number of rows, number of columns) of the DataFrame.
df.shape
Out[30]:
(1470, 35)
In [31]:
# Total number of missing cells in the whole DataFrame (per-column counts summed again).
df.isna().sum().sum()
Out[31]:
0
In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Out[32]:
| | Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
In [33]:
# Names of all columns stored with the generic 'object' dtype, in DataFrame order.
categoriacal_cols = [name for name in df.columns if df[name].dtype == 'object']
print(categoriacal_cols)
print(len(categoriacal_cols))
['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime'] 9
In [34]:
# Number of distinct values in each of the categorical columns.
unique_counts = df.loc[:, categoriacal_cols].nunique()
print("Number of unique values in each categorical column:", unique_counts, sep="\n")
Number of unique values in each categorical column: Attrition 2 BusinessTravel 3 Department 3 EducationField 6 Gender 2 JobRole 9 MaritalStatus 3 Over18 1 OverTime 2 dtype: int64
In [35]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the original DataFrame keeps its string categories.
df_label_encoded = df.copy()

# Fit a separate encoder for every categorical column and keep each one in
# `label_encoders`, keyed by column name. A single shared encoder is re-fitted
# on every column and therefore only remembers the classes of the last one,
# which makes it impossible to map the integer codes of the other columns back
# to their categories. With the dictionary, label_encoders[col].classes_ and
# label_encoders[col].inverse_transform(...) stay available for every column.
label_encoders = {}
# `label_encoder` is kept for backward compatibility: after the loop it refers
# to the encoder fitted on the last categorical column, as it did before.
label_encoder = LabelEncoder()
for col in categoriacal_cols:
    label_encoder = LabelEncoder()
    df_label_encoded[col] = label_encoder.fit_transform(df_label_encoded[col])
    label_encoders[col] = label_encoder

df_label_encoded.head()
Out[35]:
| | Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
In [36]:
## Split the data into features (x: every column except Attrition) and target (y: Attrition).
# NOTE(review): x and y are taken from the raw `df`, not from `df_label_encoded`,
# so they still hold the original string categories — confirm this is what the
# later steps expect.
x=df.drop("Attrition",axis=1)
y=df["Attrition"]
In [37]:
# Number of rows for each value of the target column.
y.value_counts()
Out[37]:
Attrition No 1233 Yes 237 Name: count, dtype: int64
In [38]:
# Horizontal bar chart of how often each Attrition value occurs.
df["Attrition"].value_counts().plot.barh(figsize=(8, 6))
plt.xlabel(xlabel="Count", labelpad=15)
plt.ylabel(ylabel="Attrition", labelpad=15)
# Left as the cell's final expression, so its Text object is echoed as the cell output.
plt.title(label="Count of the Y")
Out[38]:
Text(0.5, 1.0, 'Count of the Y')
In [39]:
def columns_with_missing_data(df, threshold=0.5):
    """Return the names of columns whose missing-value share exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    threshold : float, default 0.5
        Fraction of the row count. A column is reported when its number of
        missing values is strictly greater than ``len(df) * threshold``.
        The default reproduces the original fixed 50% rule.

    Returns
    -------
    list
        Column labels, in the DataFrame's column order, that exceed the
        threshold. Empty when no column qualifies.
    """
    max_missing = len(df) * threshold
    # Count the missing cells of every column in one vectorized pass.
    missing_per_column = df.isnull().sum()
    return missing_per_column[missing_per_column > max_missing].index.tolist()
# Report which columns of the dataset are more than half empty.
missing_columns = columns_with_missing_data(df)
print("Columns with more than 50% missing values:", missing_columns, sep="\n")
Columns with more than 50% missing values: []
In [40]:
# Build a ydata-profiling report for the DataFrame; as the cell's final
# expression the report object is handed to the notebook for display.
# NOTE(review): this import sits mid-notebook — consider moving it to the import cell.
from ydata_profiling import ProfileReport
ProfileReport(df)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]